In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
In [193]:
dta = pd.read_csv("cattle_corn_soybean_weather.csv",index_col = 0)
In [194]:
dta.head()
Out[194]:
Year Cattle production in lb Corn production in tons Soybeans production in bu STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND
0 1988 671,520,000 160,000 14250000.0 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30
1 1989 680,350,000 250,000 11970000.0 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22
2 1990 641,115,000 200,000 7480000.0 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34
3 1991 659,250,000 350,000 8050000.0 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20
4 1992 525,310,000 325,000 7830000.0 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14
In [195]:
dta['Cattle'] = dta['Cattle production in lb'].apply(lambda x: float(x.replace(",","")))
dta['Corn'] = dta['Corn production in tons'].apply(lambda x: float(x.replace(",","")) if type(x) == str else x)
dta['Soybean'] = dta['Soybeans production in bu'].apply(lambda x: x)
dta.drop(['Cattle production in lb','Corn production in tons','Soybeans production in bu'],axis=1,inplace=True)
In [196]:
dta.head()
Out[196]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 1989 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 1990 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 1991 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 1992 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0

explore the relationship between each freature to cattle production

In [6]:
g = sns.lmplot(x="AWND", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("AWND", "Cattle")
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x182cb2579e8>
In [7]:
g = sns.lmplot(x="SNOW", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("SNOW", "Cattle")
Out[7]:
<seaborn.axisgrid.FacetGrid at 0x182cd9e0f28>
In [8]:
g = sns.lmplot(x="PRCP", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("PRCP", "Cattle")
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x182ce5d2080>
In [9]:
g = sns.lmplot(x="AWND", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("AWND", "Cattle")
Out[9]:
<seaborn.axisgrid.FacetGrid at 0x182cf642ef0>
In [10]:
g = sns.lmplot(x="DT32", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("DT32", "Cattle")
Out[10]:
<seaborn.axisgrid.FacetGrid at 0x182d01bbda0>
In [11]:
g = sns.lmplot(x="DX90", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("DX90", "Cattle")
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x182d0761e10>
In [12]:
g = sns.lmplot(x="Corn", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Corn", "Cattle")
Out[12]:
<seaborn.axisgrid.FacetGrid at 0x182d13b1e10>
In [13]:
g = sns.lmplot(x="Soybean", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Soybean", "Cattle")
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x182d1f971d0>
In [14]:
g = sns.lmplot(x="TMAX", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TMAX", "Cattle")
Out[14]:
<seaborn.axisgrid.FacetGrid at 0x182d3a3de80>
In [15]:
g = sns.lmplot(x="TMIN", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TMIN", "Cattle")
Out[15]:
<seaborn.axisgrid.FacetGrid at 0x182d462b470>
In [16]:
g = sns.lmplot(x="TAVG", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TAVG", "Cattle")
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x182d5019550>
In [17]:
g = sns.lmplot(x="SNOW", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("SNOW", "Cattle")
Out[17]:
<seaborn.axisgrid.FacetGrid at 0x182d624a3c8>
In [18]:
g = sns.lmplot(x="PRCP", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("PRCP", "Cattle")
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x182d624a860>
In [19]:
g = sns.lmplot(x="EMXT", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("EMXT", "Cattle")
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x182d741d668>
In [20]:
g = sns.lmplot(x="EMNT", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("EMNT", "Cattle")
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x182d7f1aef0>
In [21]:
g = sns.lmplot(x="Year", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Year", "Cattle production in lb")
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x182d7f33fd0>
In [22]:
dta.STATE.unique()
for name in dta.STATE.unique():
    g = sns.lmplot(x="Year", y="Cattle",
               truncate=True, height=5, data=dta[dta['STATE']==name])

# Use more informative axis labels than are provided by default
    g.set_axis_labels("Year", name)
    
C:\Users\Qiuyan\Miniconda3\lib\site-packages\matplotlib\pyplot.py:514: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

For each state, cattle production fluctuates differently with year.

In [23]:
sns.pairplot(dta.dropna(), hue="STATE")
Out[23]:
<seaborn.axisgrid.PairGrid at 0x182dd9e6ac8>
In [24]:
weather_subset = dta[['TMAX','TMIN','TAVG','EMNT','DX90','DT32','PRCP','SNOW','AWND',"STATE"]]
weather_subset.head()
Out[24]:
TMAX TMIN TAVG EMNT DX90 DT32 PRCP SNOW AWND STATE
0 23.512941 10.138222 16.757727 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 AL
1 22.942778 10.650000 16.725833 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 AL
2 25.110615 11.219516 18.175161 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 AL
3 23.856379 11.579630 17.725472 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 AL
4 22.826000 10.168000 16.454600 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 AL
In [25]:
sns.pairplot(weather_subset.dropna())
Out[25]:
<seaborn.axisgrid.PairGrid at 0x182ea7dc9e8>
In [26]:
corr = weather_subset.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
           cmap = "coolwarm",
           annot = True)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x182ee712a58>

TMAX, TMIN, EMNT, DX90 are correlated with TAVG, with correlation coef >= 0.73.

DT32 and SNOW has strong negative correlation with TAVG.

PRCP and AWND also correlates with temperature parameters, but to a less degree.

Calculate the correlation coef between cattle and features for each state

In [27]:
states = dta.STATE.unique()
d = {"state": states}
dta_cor = pd.DataFrame(d)
dta_noyear = dta.drop(['Year'], axis=1)

for colname in dta_noyear:
    if colname != 'STATE':
        dta_cor[colname] = dta_cor['state'].apply(lambda x: dta_noyear[dta_noyear['STATE']==x]['Cattle'].corr(dta_noyear[dta_noyear['STATE']==x][colname]))
In [28]:
dta_cor.set_index('state',inplace=True)
In [29]:
dta_cor.drop(['Cattle'],axis=1,inplace=True)
In [30]:
sns.set()

# Load the example flights dataset and conver to long-form

# Draw a heatmap with the numeric values in each cell
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(dta_cor, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x182ee626160>

Each feature has quite different correlation coef for different states

In [31]:
dta_scaled = dta.drop(['STATE','Year'],axis=1)
dta_scaled.head()
Out[31]:
TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0
In [32]:
x = dta_scaled.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
dta_scaled = pd.DataFrame(x_scaled,columns = list(dta.drop(['STATE','Year'],axis=1)))
In [33]:
dta_scaled.head()
Out[33]:
TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 0.790195 0.661516 0.727256 0.679164 0.555320 0.483578 0.280455 0.592464 0.005615 0.122893 0.085607 0.007724 0.023348
1 0.768568 0.681461 0.726027 0.577665 0.440181 0.320732 0.223477 0.782254 0.001346 0.115742 0.086734 0.012486 0.019576
2 0.850797 0.703656 0.781868 0.693438 0.615126 0.602606 0.168843 0.677661 0.000007 0.126468 0.081727 0.009841 0.012149
3 0.803222 0.717691 0.764542 0.600714 0.571377 0.433359 0.224427 0.759804 0.000451 0.113955 0.084041 0.017777 0.013092
4 0.764138 0.662676 0.715576 0.574027 0.592415 0.247887 0.229778 0.706641 0.005708 0.108592 0.066947 0.016454 0.012728
In [34]:
dta_scaled['STATE'] = dta['STATE']
dta_scaled['Year'] = dta['Year']
In [35]:
dta_scaled.head()
Out[35]:
TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean STATE Year
0 0.790195 0.661516 0.727256 0.679164 0.555320 0.483578 0.280455 0.592464 0.005615 0.122893 0.085607 0.007724 0.023348 AL 1988
1 0.768568 0.681461 0.726027 0.577665 0.440181 0.320732 0.223477 0.782254 0.001346 0.115742 0.086734 0.012486 0.019576 AL 1989
2 0.850797 0.703656 0.781868 0.693438 0.615126 0.602606 0.168843 0.677661 0.000007 0.126468 0.081727 0.009841 0.012149 AL 1990
3 0.803222 0.717691 0.764542 0.600714 0.571377 0.433359 0.224427 0.759804 0.000451 0.113955 0.084041 0.017777 0.013092 AL 1991
4 0.764138 0.662676 0.715576 0.574027 0.592415 0.247887 0.229778 0.706641 0.005708 0.108592 0.066947 0.016454 0.012728 AL 1992
In [36]:
g = sns.lmplot(x="AWND", y="Cattle", hue="STATE",
               truncate=True, height=15, data=dta_scaled)

# Use more informative axis labels than are provided by default
g.set_axis_labels("AWND", "Cattle")
Out[36]:
<seaborn.axisgrid.FacetGrid at 0x182efe14e48>

scale each feature by state

In [37]:
dta_scalebystate = dta.copy(deep=True)
dta_scalebystate.head()
Out[37]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 1989 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 1990 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 1991 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 1992 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0
In [38]:
for colname in list(dta_scalebystate.drop(['STATE','Year'],axis=1)):
    dta_scalebystate[colname] = dta_scalebystate.groupby('STATE')[colname].apply(lambda x: (x-min(x))/(max(x)-min(x)))
In [39]:
dta_scalebystate.head()
Out[39]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 0.300681 0.061043 0.156056 0.666801 0.595192 0.589304 0.733080 0.368460 0.144514 0.967742 0.963360 0.377049 0.656770
1 1989 AL 0.051115 0.308477 0.139636 0.292656 0.000000 0.232877 0.358840 0.750808 0.034655 0.903226 1.000000 0.672131 0.525069
2 1990 AL 1.000000 0.583826 0.885779 0.719419 0.904348 0.849826 0.000000 0.540095 0.000174 1.000000 0.837196 0.508197 0.265712
3 1991 AL 0.451008 0.757933 0.654270 0.377620 0.678196 0.479388 0.365079 0.705580 0.011598 0.887097 0.912447 1.000000 0.298637
4 1992 AL 0.000000 0.075440 0.000000 0.279246 0.786948 0.073437 0.400228 0.598478 0.146920 0.838710 0.356670 0.918033 0.285929
In [40]:
dta.head()
Out[40]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 1989 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 1990 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 1991 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 1992 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0
In [41]:
g = sns.lmplot(x="Soybean", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Soybean", "Cattle")
Out[41]:
<seaborn.axisgrid.FacetGrid at 0x182ef8e1cc0>
In [42]:
g = sns.lmplot(x="Corn", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Corn", "Cattle")
Out[42]:
<seaborn.axisgrid.FacetGrid at 0x182ef8e14e0>
In [43]:
g = sns.lmplot(x="AWND", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("AWND", "Cattle")
Out[43]:
<seaborn.axisgrid.FacetGrid at 0x182ef9072e8>
In [44]:
g = sns.lmplot(x="SNOW", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("SNOW", "Cattle")
Out[44]:
<seaborn.axisgrid.FacetGrid at 0x182ef9d04e0>
In [45]:
g = sns.lmplot(x="PRCP", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("PRCP", "Cattle")
Out[45]:
<seaborn.axisgrid.FacetGrid at 0x182efa31278>
In [46]:
g = sns.lmplot(x="DT32", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("DT32", "Cattle")
Out[46]:
<seaborn.axisgrid.FacetGrid at 0x182efa8c6d8>
In [47]:
g = sns.lmplot(x="DX90", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("DX90", "Cattle")
Out[47]:
<seaborn.axisgrid.FacetGrid at 0x182f495a550>
In [48]:
g = sns.lmplot(x="EMXT", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("EMXT", "Cattle")
Out[48]:
<seaborn.axisgrid.FacetGrid at 0x182f52a48d0>
In [49]:
g = sns.lmplot(x="EMNT", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("EMNT", "Cattle")
Out[49]:
<seaborn.axisgrid.FacetGrid at 0x182f57659e8>
In [50]:
g = sns.lmplot(x="TMAX", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TMAX", "Cattle")
Out[50]:
<seaborn.axisgrid.FacetGrid at 0x182f65429e8>
In [51]:
g = sns.lmplot(x="TMIN", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TMIN", "Cattle")
Out[51]:
<seaborn.axisgrid.FacetGrid at 0x182f498eda0>
In [52]:
g = sns.lmplot(x="TAVG", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("TAVG", "Cattle")
Out[52]:
<seaborn.axisgrid.FacetGrid at 0x182f730cbe0>
In [53]:
sns.set(style="whitegrid")

# Draw a scatter plot while assigning point colors and sizes to different
# variables in the dataset
f, ax = plt.subplots(figsize=(15,15))
sns.despine(f, left=True, bottom=True)
sns.scatterplot(x="Year", y="Cattle",
                hue="STATE", 
                linewidth=0,
                data=dta_scalebystate, ax=ax)
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x182f80f5208>
In [54]:
sns.set(style="whitegrid")

# Draw a scatter plot while assigning point colors and sizes to different
# variables in the dataset
f, ax = plt.subplots(figsize=(15,15))
sns.despine(f, left=True, bottom=True)
sns.scatterplot(x="Year", y="TAVG",
                hue="STATE", 
                linewidth=0,
                data=dta_scalebystate, ax=ax)
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x182f733d3c8>
In [55]:
g = sns.lmplot(x="Year", y="Cattle", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Year", "Cattle")
Out[55]:
<seaborn.axisgrid.FacetGrid at 0x182f87d56d8>
In [56]:
g = sns.lmplot(x="Year", y="TMAX", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Year", "TMAX")
Out[56]:
<seaborn.axisgrid.FacetGrid at 0x182f8b4fc88>
In [57]:
g = sns.lmplot(x="Year", y="TMIN", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Year", "TMIN")
Out[57]:
<seaborn.axisgrid.FacetGrid at 0x182f9c31160>
In [58]:
g = sns.lmplot(x="Year", y="SNOW", 
               truncate=True, height=15, data=dta_scalebystate)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Year", "SNOW")
Out[58]:
<seaborn.axisgrid.FacetGrid at 0x182fa571b00>

modeling

linear regression model

In [197]:
sns.distplot(dta['Cattle'])
Out[197]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb6a56a0>
In [198]:
sns.distplot(np.log(np.log(dta['Cattle'])))
Out[198]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fd4c0b38>
In [199]:
sns.distplot(np.log(dta['Cattle']))
Out[199]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fd5c8128>
In [200]:
sns.distplot(boxcox(dta['Cattle'],0.5))
Out[200]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fd533da0>
In [201]:
sns.distplot(dta_scalebystate['Cattle'])
Out[201]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fd6907b8>
In [64]:
sns.distplot(np.log(dta_scalebystate['Cattle']+1))
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb641c50>
In [65]:
sns.distplot(dta_scaled['Cattle'])
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb6c63c8>
In [66]:
sns.distplot(np.log(dta_scaled['Cattle']+1))
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb79cd30>
In [67]:
sns.distplot(dta_scalebystate['TAVG'])
Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb75d710>
In [68]:
sns.distplot(dta['TAVG'])
Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb8d8b70>
In [69]:
sns.distplot(dta.dropna()['Corn'])
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fb8fbe48>
In [70]:
sns.distplot(dta_scalebystate.dropna()['Corn'])
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fc9bc470>
In [203]:
dta.columns
dta.head()
Out[203]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 1989 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 1990 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 1991 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 1992 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0
In [204]:
# will ignore the STATE and Year first
x = dta.dropna()[['TMAX', 'TMIN', 'TAVG', 'EMXT', 'EMNT', 'DX90', 'DT32',
       'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
y = dta.dropna()['Cattle']
In [205]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.4, random_state = None)
In [206]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(xTrain,yTrain)
print(lm.intercept_)
print(lm.coef_)
cdf = pd.DataFrame(lm.coef_, x.columns, columns=['coef'])
print(cdf)
predictions = lm.predict(xTest)
plt.scatter(yTest,predictions)
sns.distplot(yTest-predictions)
-11617759317.198061
[ 2.21711337e+09  1.29927086e+09 -3.22046361e+09 -2.35761736e+07
  1.11965457e+07  1.50321242e+06  1.32226652e+07 -5.22731573e+05
  1.09267582e+05  8.12346675e+08  1.61751093e+02  1.20382309e+00]
                 coef
TMAX     2.217113e+09
TMIN     1.299271e+09
TAVG    -3.220464e+09
EMXT    -2.357617e+07
EMNT     1.119655e+07
DX90     1.503212e+06
DT32     1.322267e+07
PRCP    -5.227316e+05
SNOW     1.092676e+05
AWND     8.123467e+08
Corn     1.617511e+02
Soybean  1.203823e+00
Out[206]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fd769b00>
In [207]:
from sklearn import metrics
metrics.mean_absolute_error(yTest,predictions)
metrics.r2_score(yTest,predictions)
Out[207]:
0.6140506732670812
In [208]:
predictions2 = lm.predict(xTrain)
metrics.r2_score(yTrain,predictions2)
Out[208]:
0.5807489334260857
In [77]:
### predictions
In [78]:
### include regularization
from sklearn import linear_model
reg = linear_model.Lasso(alpha=1)
reg.fit(xTrain,yTrain)
C:\Users\Qiuyan\Miniconda3\lib\site-packages\sklearn\linear_model\coordinate_descent.py:492: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Out[78]:
Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
In [79]:
reg.coef_
Out[79]:
array([ 6.97400084e+08, -3.44600826e+08, -8.88342968e+07, -5.05793975e+07,
        1.43491703e+07,  3.33164619e+06,  8.11373351e+06, -6.31117693e+05,
        2.40951429e+05,  9.25623018e+08,  1.60769973e+02,  1.48716020e+00])
In [80]:
predictions = reg.predict(xTrain)
print(metrics.r2_score(yTrain,predictions))

predictions = reg.predict(xTest)
metrics.r2_score(yTest,predictions)
0.5972372455269135
Out[80]:
0.5803012221605319
In [81]:
plt.scatter(np.log(yTest),np.log(predictions))
#sns.distplot(yTest-predictions)
C:\Users\Qiuyan\Miniconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log
  """Entry point for launching an IPython kernel.
Out[81]:
<matplotlib.collections.PathCollection at 0x182fcc404a8>
In [82]:
sns.distplot(yTest-predictions)
Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0x182fcc7f5c0>
In [209]:
dta.columns
dta.head()
Out[209]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean
0 1988 AL 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0
1 1989 AL 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0
2 1990 AL 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0
3 1991 AL 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0
4 1992 AL 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0
In [287]:
dta.boxplot(column='Cattle',by='STATE',figsize = (20,8))
Out[287]:
<matplotlib.axes._subplots.AxesSubplot at 0x182ffb77160>
In [85]:
dta_selected = dta[['Year','STATE','TAVG','PRCP','SNOW','AWND','Corn','Soybean','Cattle']]
In [86]:
dta_selected.columns
Out[86]:
Index(['Year', 'STATE', 'TAVG', 'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean',
       'Cattle'],
      dtype='object')
In [211]:
x = dta.dropna()[['TAVG','STATE','PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
y = dta.dropna()['Cattle']
In [212]:
x.shape
Out[212]:
(907, 7)
In [89]:
x.STATE.unique()
Out[89]:
array(['AL', 'AR', 'DE', 'FL', 'GA', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
       'MD', 'MI', 'MN', 'MS', 'MO', 'NE', 'NJ', 'NY', 'NC', 'ND', 'OH',
       'OK', 'PA', 'SC', 'SD', 'TN', 'TX', 'VA', 'WV', 'WI'], dtype=object)
In [90]:
dta_selected.dropna(inplace=True)
C:\Users\Qiuyan\Miniconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [91]:
dta_selected.shape
Out[91]:
(907, 9)
In [92]:
dta_selected.STATE.unique()
Out[92]:
array(['AL', 'AR', 'DE', 'FL', 'GA', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
       'MD', 'MI', 'MN', 'MS', 'MO', 'NE', 'NJ', 'NY', 'NC', 'ND', 'OH',
       'OK', 'PA', 'SC', 'SD', 'TN', 'TX', 'VA', 'WV', 'WI'], dtype=object)
In [93]:
train_r2 = []
test_r2 = []
for state in dta_selected.STATE.unique():
    tmp = dta_selected[dta_selected['STATE']==state]
    x = tmp[['TAVG','PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
    y = tmp['Cattle']
    xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.4, random_state = None)
    lm = LinearRegression()
    lm.fit(xTrain,yTrain)
    prediction1 = lm.predict(xTrain)
    prediction2 = lm.predict(xTest)
    train_r2.append(metrics.r2_score(yTrain,prediction1))
    test_r2.append(metrics.r2_score(yTest,prediction2))
    
In [94]:
states = [state for state in dta_selected.STATE.unique()]
In [95]:
dt = {}
dt['STATE'] = states
dt['train_r2'] = train_r2
dt['test_r2'] = test_r2
ddf = pd.DataFrame(dt)
In [257]:
x = dta.dropna()[['STATE', 'TAVG', 'EMXT', 'EMNT', 'DX90', 'DT32',
       'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
#x = dta.dropna()[['TMAX', 'TMIN', 'TAVG', 'EMXT', 'EMNT', 'DX90', 'DT32',
#       'PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean']]
#x = dta.dropna()[[ 'TAVG','PRCP', 'SNOW', 'AWND', 'Corn', 'Soybean','STATE']]
y = dta.dropna()['Cattle']
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.4, random_state = None)
In [218]:
lm = LinearRegression()
lm.fit(xTrain,yTrain)
y_pred = lm.predict(xTest)
print(metrics.r2_score(yTest,y_pred))
y_pred_train = lm.predict(xTrain)
print(metrics.r2_score(yTrain,y_pred_train))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-218-1f1d2c186d04> in <module>
      1 lm = LinearRegression()
----> 2 lm.fit(xTrain,yTrain)
      3 y_pred = lm.predict(xTest)
      4 print(metrics.r2_score(yTest,y_pred))
      5 y_pred_train = lm.predict(xTrain)

~\Miniconda3\lib\site-packages\sklearn\linear_model\base.py in fit(self, X, y, sample_weight)
    456         n_jobs_ = self.n_jobs
    457         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
--> 458                          y_numeric=True, multi_output=True)
    459 
    460         if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:

~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    754                     ensure_min_features=ensure_min_features,
    755                     warn_on_dtype=warn_on_dtype,
--> 756                     estimator=estimator)
    757     if multi_output:
    758         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

~\Miniconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    565         # make sure we actually converted to numeric:
    566         if dtype_numeric and array.dtype.kind == "O":
--> 567             array = array.astype(np.float64)
    568         if not allow_nd and array.ndim >= 3:
    569             raise ValueError("Found array with dim %d. %s expected <= 2."

ValueError: could not convert string to float: 'NC'
In [258]:
x['STATE'].unique()
Out[258]:
array(['AL', 'AR', 'DE', 'FL', 'GA', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
       'MD', 'MI', 'MN', 'MS', 'MO', 'NE', 'NJ', 'NY', 'NC', 'ND', 'OH',
       'OK', 'PA', 'SC', 'SD', 'TN', 'TX', 'VA', 'WV', 'WI'], dtype=object)
In [259]:
state_list = x['STATE'].unique()
state_list
Out[259]:
array(['AL', 'AR', 'DE', 'FL', 'GA', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA',
       'MD', 'MI', 'MN', 'MS', 'MO', 'NE', 'NJ', 'NY', 'NC', 'ND', 'OH',
       'OK', 'PA', 'SC', 'SD', 'TN', 'TX', 'VA', 'WV', 'WI'], dtype=object)
In [260]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
x['STATE'] = labelencoder.fit_transform(x['STATE'])
In [261]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.4, random_state = None)
lm = LinearRegression()
lm.fit(xTrain,yTrain)
y_pred = lm.predict(xTest)
print(metrics.r2_score(yTest,y_pred))
y_pred_train = lm.predict(xTrain)
print(metrics.r2_score(yTrain,y_pred_train))
0.5298497644269173
0.6000975822555786
In [281]:
plt.scatter(np.log(yTest),np.log(y_pred))
C:\Users\Qiuyan\Miniconda3\lib\site-packages\ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log
  """Entry point for launching an IPython kernel.
Out[281]:
<matplotlib.collections.PathCollection at 0x182fdc7b5c0>
In [282]:
plt.scatter(yTest,y_pred)
Out[282]:
<matplotlib.collections.PathCollection at 0x182feccc390>
In [277]:
cdf = pd.DataFrame(lm.coef_, x.columns, columns=['coef'])
print(cdf)
cdf.to_csv(r'cdf.csv')
                 coef
STATE    3.621563e+07
TAVG     4.648156e+08
EMXT     2.189224e+07
EMNT    -2.722362e+07
DX90     9.994848e+06
DT32     2.485491e+07
PRCP    -8.860743e+05
SNOW     9.845560e+03
AWND     8.225139e+08
Corn     1.227393e+02
Soybean  1.344158e+00
In [278]:
labelencoder.inverse_transform(range(31))
cdf2 = pd.DataFrame(labelencoder.inverse_transform(range(31)),range(31),columns=['label'])
cdf2.to_csv(r'cdf2.csv')
In [100]:
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor(random_state=None)
clf.fit(X_train_one_hot,yTrain)
Out[100]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [101]:
y_pred = clf.predict(X_test_one_hot)
print(metrics.r2_score(yTest,y_pred))
0.9820727160397184
In [102]:
y_pred2 = clf.predict(X_train_one_hot)
print(metrics.r2_score(yTrain,y_pred2))
1.0
In [103]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-103-301a91b38718> in <module>
      2 from IPython.display import Image
      3 from sklearn.tree import export_graphviz
----> 4 import pydotplus
      5 dot_data = StringIO()
      6 export_graphviz(clf, out_file=dot_data,  

ModuleNotFoundError: No module named 'pydotplus'
In [159]:
dta.head()
Out[159]:
Year STATE TMAX TMIN TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Cattle Corn Soybean state_group
0 1988 3 23.512941 10.138222 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 671520000.0 160000.0 14250000.0 AL
1 1989 3 22.942778 10.650000 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 680350000.0 250000.0 11970000.0 AL
2 1990 3 25.110615 11.219516 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 641115000.0 200000.0 7480000.0 AL
3 1991 3 23.856379 11.579630 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 659250000.0 350000.0 8050000.0 AL
4 1992 3 22.826000 10.168000 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 525310000.0 325000.0 7830000.0 AL
In [265]:
range(12)
Out[265]:
range(0, 12)

TensorFlow

In [288]:
x.head()
Out[288]:
STATE TAVG EMXT EMNT DX90 DT32 PRCP SNOW AWND Corn Soybean
0 0 16.757727 37.796078 -10.460000 67.568627 63.200000 1286.194118 19.876190 3.30 160000.0 14250000.0
1 0 16.725833 35.744444 -16.494000 44.814815 50.360000 1635.648624 4.766355 3.22 250000.0 11970000.0
2 0 18.175161 38.084615 -7.325806 84.200000 38.048387 1443.063415 0.024000 3.34 200000.0 7480000.0
3 0 17.725472 36.210345 -9.618519 60.551724 50.574074 1594.311304 1.595238 3.20 350000.0 8050000.0
4 0 16.454600 35.670909 -8.516000 34.636364 51.780000 1496.424000 20.207207 3.14 325000.0 7830000.0